Film Bang Directory

Jupyter Notebook for computing and displaying basic figures

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
In [2]:
plt.rcParams['figure.figsize'] = [12, 6]
plt.style.use('fivethirtyeight')
In [3]:
x = str
df = pd.read_csv('Core_Data/Film_Bang_Personnel_Master_Step_2.csv', dtype={'Trainee prof':x, '2020': x, '2019': x, '2018': x,
       '2017': x, '2016': x, '2015': x, '2014': x, '2013': x, '2012': x, '2011': x, '2010': x, '2009': x,
       '2008': x, '2007': x, '2006': x, '2005': x, '2004': x, '2003': x, '2002': x, '2001': x, '2000': x,
       '1999': x, '1998': x, '1997': x, '1996': x, '1995': x, '1994': x, '1993': x, '1992': x, '1991': x,
       '1990': x, '1989': x, '1988': x, '1987': x, '1986': x, '1984': x, '1982': x, '1981': x,
       '1979': x, '1978': x, '1976': x})
In [4]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 300)

The code below brings basic analysis into one document and shows the output of various calculations. The anonymisation of the data has been carried out in a separate notebook so names, full postcodes, locations etc are removed. We're left with the first 3 characters of the 'last known' postcode where entries have one.

This notebook will have other subsidiary notebooks added to it over the next few days (3/10/20)

Index:

  • Basic Description of Data
  • Departments
  • Location
  • Trainee Program
  • Longevity

Number of Entries, Columns

In [5]:
df.shape
Out[5]:
(3634, 98)
In [6]:
df.columns
Out[6]:
Index(['UUID', 'Codes', 'Trainee prog', 'Gender', 'Role 1', 'Role 1 Category',
       'Role 2', 'Role 2 Category', 'Role 3', 'Role 3 Category',
       'Has Consistent Role', 'Postcode1', 'Rural', 'No of Yrs', '2020',
       '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011',
       '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002',
       '2001', '2000', '1999', '1998', '1997', '1996', '1995', '1994', '1993',
       '1992', '1991', '1990', '1989', '1988', '1987', '1986', '1984', '1982',
       '1981', '1979', '1978', '1976', 'Company', 'Description', 'Yr Cnt 2020',
       'Yr Cnt 2019', 'Yr Cnt 2018', 'Yr Cnt 2017', 'Yr Cnt 2016',
       'Yr Cnt 2015', 'Yr Cnt 2014', 'Yr Cnt 2013', 'Yr Cnt 2012',
       'Yr Cnt 2011', 'Yr Cnt 2010', 'Yr Cnt 2009', 'Yr Cnt 2008',
       'Yr Cnt 2007', 'Yr Cnt 2006', 'Yr Cnt 2005', 'Yr Cnt 2004',
       'Yr Cnt 2003', 'Yr Cnt 2002', 'Yr Cnt 2001', 'Yr Cnt 2000',
       'Yr Cnt 1999', 'Yr Cnt 1998', 'Yr Cnt 1997', 'Yr Cnt 1996',
       'Yr Cnt 1995', 'Yr Cnt 1994', 'Yr Cnt 1993', 'Yr Cnt 1992',
       'Yr Cnt 1991', 'Yr Cnt 1990', 'Yr Cnt 1989', 'Yr Cnt 1988',
       'Yr Cnt 1987', 'Yr Cnt 1986', 'Yr Cnt 1984', 'Yr Cnt 1982',
       'Yr Cnt 1981', 'Yr Cnt 1979', 'Yr Cnt 1978', 'Yr Cnt 1976'],
      dtype='object')
In [7]:
# Display of sample of data
df[['Trainee prog', 'Gender', 'Role 1', 'Role 1 Category', 'Rural', 'No of Yrs']].head(20)
Out[7]:
Trainee prog Gender Role 1 Role 1 Category Rural No of Yrs
0 NaN Unknown Location Scout Production NaN 1.0
1 NaN Male Camera (Lighting) Camera NaN 3.0
2 NaN Male Editor Post-Production NaN 3.0
3 NaN Female Costume Designer Costume NaN 22.0
4 NaN Male Sound Boom Operator Camera NaN 4.0
5 NaN Female Camera 2nd Assistant Camera NaN 1.0
6 NaN Male Camera (Lighting) Camera NaN 3.0
7 NaN Unknown Stunt Co-Ordinator Casting NaN 2.0
8 NaN Unknown Photographer Camera NaN 1.0
9 NaN Unknown Photographer Camera NaN 2.0
10 NaN Male Construction Construction NaN 13.0
11 NaN Male Runner Production NaN 1.0
12 NaN Male Assistant Editor Post-Production NaN 11.0
13 NaN Male Director Direction NaN 9.0
14 NaN Male Producer Producer NaN 7.0
15 NaN Male Petty Cash Buyer Art NaN 2.0
16 NaN Male Writer/Producer Script NaN 1.0
17 NaN Female Make-Up Assistant Hair & Make-Up NaN 2.0
18 NaN Female Unit Manager Production NaN 4.0
19 NaN Male Location Manager Production NaN 2.0

Basic Shape of the Data

In [113]:
# plot_years = df.loc[:, '2020':'1976'].columns
# print(plot_years)
# Data
plot_years = [2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013,2012,2011,2010,2009,2008,2007,2006,2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993, 1992, 1991, 1990, 1989,1988,1987,1986,1984,1982,1981,1979,1978,1976]

totals = []
for column in df.loc[:, '2020':'1976']:
    tally = df[column].value_counts(dropna=True)
    totals.append(tally[0])
print(totals)

Growth & Decline

In [120]:
x_indexes = list(range(len(years_x)))
x_indexes.reverse()

plt.plot(x_indexes, totals, color='#2E0014', label='Total Entries')
plt.xlabel('Years - ommitting 77, 80, 83, 85')
plt.ylabel('Number of Entries')
plt.title('Basic Growth and Decline')

plt.xticks(ticks=x_indexes, labels=years_x, rotation=90)

plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Historical_Events/basic_growth_decline.png')

plt.show()

Periods of Growth and Decline

In [122]:
x_indexes = list(range(len(plot_years)))
x_indexes.reverse()

plt.plot(x_indexes, totals, color='#2E0014', label='Total Entries')
plt.xlabel('Years - ommitting 77, 80, 83, 85')
plt.ylabel('Number of Entries')
plt.title('Periods of Growth and Decline')
plt.xticks(ticks=x_indexes, labels=plot_years, rotation=90)

plt.axvspan(x_indexes[40], x_indexes[30], facecolor='orange', alpha=0.5, label='Development of Screen Industry in Scotland')
plt.axvspan(x_indexes[30], x_indexes[20], facecolor='magenta', alpha=0.5, label='Deregulation, changing technology, flexible working')
plt.axvspan(x_indexes[20], x_indexes[10], facecolor='blue', alpha=0.5, label='Decline of industry')
plt.axvspan(x_indexes[10], x_indexes[0], facecolor='green', alpha=0.5, label='Increased Production, New Broadcasters')

plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Historical_Events/timeframes.png')
plt.show()

Total Entries per Year

In [117]:
width=0.8
plt.bar(plot_years, totals, color='magenta', width = width, label='Total Workers Per Year')

plt.xlabel('Years')
plt.ylabel('No. of Entries')
plt.title('Entries Per Year')
plt.xticks(ticks=plot_years, rotation=90)

plt.legend()
plt.tight_layout()
plt.savefig('Outputs/Basic/entries_per_year.png')
plt.show()
[688, 644, 636, 627, 634, 675, 627, 668, 664, 692, 639, 726, 708, 759, 786, 793, 806, 782, 728, 710, 713, 676, 651, 585, 535, 492, 476, 397, 361, 355, 313, 266, 206, 210, 197, 200, 119, 104, 85, 83, 66]

Showing Growth Alongside Historical Events

In [119]:
x_indexes = list(range(len(plot_years)))
x_indexes.reverse()

plt.plot(x_indexes, totals, color='#2E0014', label='Total Entries')

plt.xlabel('Years - ommitting 77, 80, 83, 85')
plt.ylabel('Number of Entries')
plt.title('Historical Events and Number of Entries')
plt.xticks(ticks=x_indexes, labels=plot_years, rotation=90)
plt.axvline(x=x_indexes[36], linewidth=2, color='blue', label='Channel 4 Starts')
plt.axvline(x=x_indexes[24], linewidth=2, color='orange', label='Trainspotting')
plt.axvline(x=x_indexes[21], linewidth=2, color='green', label='Tartan Shorts Launched')
plt.axvline(x=x_indexes[16], linewidth=2, color='cyan', label='STV Crisis')
plt.axvline(x=x_indexes[12], linewidth=2, color='red', label='Financial Crisis')
plt.axvline(x=x_indexes[1], linewidth=2, color='magenta', label='Film Bang Website Relaunched')

plt.legend()
plt.grid(True)
plt.tight_layout()

plt.savefig('Outputs/Historical_Events/workers_by_year_key_dates.png')
plt.show()
In [ ]:
 

Departments

Figures for number of people in each department. Departments were allocated according to keywords found in role descriptions.

In [8]:
df['Role 1 Category'].value_counts()
Out[8]:
Production         816
Camera             670
Art                480
Producer           263
Direction          261
Post-Production    242
Hair & Make-Up     201
Costume            154
Sound              152
Support             86
Music               86
Script              79
Casting             64
Construction        51
Special FX          22
Trainee              6
Name: Role 1 Category, dtype: int64

Percentages for above figures

In [9]:
df['Role 1 Category'].value_counts(normalize=True).apply(lambda x: format((x*100), '.0f'))
Out[9]:
Production         22
Camera             18
Art                13
Producer            7
Direction           7
Post-Production     7
Hair & Make-Up      6
Costume             4
Sound               4
Support             2
Music               2
Script              2
Casting             2
Construction        1
Special FX          1
Trainee             0
Name: Role 1 Category, dtype: object
In [10]:
# Shows figures for those whose roles are consistent over the 3 columns (including empty values for 2 and 3), 
# and those where the roles differ by department
df['Has Consistent Role'].value_counts(dropna=False)
Out[10]:
True    3101
NaN      533
Name: Has Consistent Role, dtype: int64
In [11]:
# Percentages for the above
df['Has Consistent Role'].value_counts(normalize=True, dropna=False).apply(lambda x: format((x*100), '.0f'))
Out[11]:
True    85
NaN     15
Name: Has Consistent Role, dtype: object
In [12]:
filt = df.groupby(['Role 1 Category'])
filt['Has Consistent Role'].value_counts(dropna=False)
Out[12]:
Role 1 Category  Has Consistent Role
Art              True                   440
                 NaN                     40
Camera           True                   600
                 NaN                     70
Casting          True                    58
                 NaN                      6
Construction     True                    51
Costume          True                   140
                 NaN                     14
Direction        True                   161
                 NaN                    100
Hair & Make-Up   True                   179
                 NaN                     22
Music            True                    83
                 NaN                      3
Post-Production  True                   211
                 NaN                     31
Producer         True                   135
                 NaN                    128
Production       True                   750
                 NaN                     66
Script           True                    59
                 NaN                     20
Sound            True                   136
                 NaN                     16
Special FX       True                    17
                 NaN                      5
Support          True                    75
                 NaN                     11
Trainee          True                     6
Name: Has Consistent Role, dtype: int64
In [13]:
# Percentages of the above
filt = df.groupby(['Role 1 Category'])
filt['Has Consistent Role'].value_counts(normalize=True, dropna=False).apply(lambda x: format((x*100), '.0f'))
Out[13]:
Role 1 Category  Has Consistent Role
Art              True                    92
                 NaN                      8
Camera           True                    90
                 NaN                     10
Casting          True                    91
                 NaN                      9
Construction     True                   100
Costume          True                    91
                 NaN                      9
Direction        True                    62
                 NaN                     38
Hair & Make-Up   True                    89
                 NaN                     11
Music            True                    97
                 NaN                      3
Post-Production  True                    87
                 NaN                     13
Producer         True                    51
                 NaN                     49
Production       True                    92
                 NaN                      8
Script           True                    75
                 NaN                     25
Sound            True                    89
                 NaN                     11
Special FX       True                    77
                 NaN                     23
Support          True                    87
                 NaN                     13
Trainee          True                   100
Name: Has Consistent Role, dtype: object

In the above data we can see the departments Direction and Producer show the greatest instances of people having different roles across Role Categories 1, 2 & 3

In [14]:
# Data
depts = df['Role 1 Category'].dropna().unique().tolist()
nums = df['Role 1 Category'].value_counts().tolist()

# Plot
width=0.8
plt.bar(depts, nums, color='magenta', width = width, label='Entries')

plt.xlabel('Depts')
plt.ylabel('No. Entries')
plt.title('Film Bang Department Count')
plt.xticks(ticks=depts, rotation=90)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Department_Stats/department_count.png')
plt.show()

Production - Breakdown of this department given its size

In [314]:
filt = (df['Role 1 Category'] == 'Production')
production = df[filt]
#production
df_production = pd.DataFrame(production)
#print(df_production)
In [312]:
df_production.loc[production['Role 1'].str.contains('Unit|Director|ProductionAssistant|Engineer|Floor|Script|Autocue|Location|Property|Scout|Runner|Newcomer|Production Assistant|Stagehand|Stage Hand|Studio|Production Executive|Drapes'), 'Role 1 SubCategory'] = 'Location'
df_production.loc[production['Role 1'].str.contains('Line|Researcher|Production Manager|Secretary|Accountant|Payroll|Production Co-ordinator|Production Co-Ordinator|Office|Publicist'), 'Role 1 SubCategory'] = 'Admin'
df_production.loc[production['Role 1'].str.contains('Driver|Pilot|Transport'), 'Role 1 SubCategory'] = 'Transport'
In [313]:
# plt.style.use("fivethirtyeight")
# plt.rcParams['figure.figsize'] = [12, 6]

breakdown = production['Role 1 SubCategory'].value_counts().array
colors = ['#E49273', '#679436', '#A5BE00']
labels = ['Location', 'Admin', 'Transport']
plt.pie(breakdown,  labels=labels, labeldistance=1.2, colors=colors, shadow=True, 
        startangle=45, autopct='%1.0f%%', wedgeprops={'edgecolor':'black'})
plt.title('Production Sub Categories')
plt.tight_layout()
plt.savefig('Outputs/Department_Stats/production_sub_categories_pie_chart.png')
plt.show()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Growth

In [126]:
# Create new df for storing filtered values
structure = {
    'Year':[],
    'Direction':[],
    'Music':[],
    'Producer':[],
    'Construction':[],
    'Costume':[],
    'Hair & Make-Up':[], 
    'Sound':[], 
    'Post-Production':[], 
    'Casting':[], 
    'Support':[], 
    'Art':[], 
    'Camera':[], 
    'Production':[], 
    'Script':[], 
    'Special FX':[] 
}

df_roles = pd.DataFrame(structure)

for column in df.loc[:, '2020':'1976']:
    # filter Role 1 Category for entries of 'Direction'
    filt1 = (df[column] == column) & (df['Role 1 Category'] == 'Direction')
    # apply filter
    direction = df[filt1]
    # count items in dataframe oject
    direction_count = len(direction.index)
    
    filt2 = (df[column] == column) & (df['Role 1 Category'] == 'Music')
    music = df[filt2]
    music_count = len(music.index)
    
    filt3 = (df[column] == column) & (df['Role 1 Category'] == 'Producer')
    producer = df[filt3]
    producer_count = len(producer.index)
    
    filt4 = (df[column] == column) & (df['Role 1 Category'] == 'Construction')
    construction = df[filt4]
    construction_count = len(construction.index)
    
    filt5 = (df[column] == column) & (df['Role 1 Category'] == 'Costume')
    costume = df[filt5]
    costume_count = len(costume.index)
    
    filt6 = (df[column] == column) & (df['Role 1 Category'] == 'Hair & Make-Up')
    hair = df[filt6]
    hair_count = len(hair.index)
    
    filt7 = (df[column] == column) & (df['Role 1 Category'] == 'Sound')
    sound = df[filt7]
    sound_count = len(sound.index)
    
    filt8 = (df[column] == column) & (df['Role 1 Category'] == 'Post-Production')
    postp = df[filt8]
    postp_count = len(postp.index)
    
    filt9 = (df[column] == column) & (df['Role 1 Category'] == 'Casting')
    casting = df[filt9]
    casting_count = len(casting.index)
    
    filt10 = (df[column] == column) & (df['Role 1 Category'] == 'Support')
    support = df[filt10]
    support_count = len(support.index)
    
    filt11 = (df[column] == column) & (df['Role 1 Category'] == 'Art')
    art = df[filt11]
    art_count = len(art.index)
    
    filt12 = (df[column] == column) & (df['Role 1 Category'] == 'Camera')
    camera = df[filt12]
    camera_count = len(camera.index)
    
    filt13 = (df[column] == column) & (df['Role 1 Category'] == 'Production')
    production = df[filt13]
    production_count = len(production.index)
    
    filt14 = (df[column] == column) & (df['Role 1 Category'] == 'Script')
    script = df[filt14]
    script_count = len(script.index)
    
    filt15 = (df[column] == column) & (df['Role 1 Category'] == 'Special FX')
    special = df[filt15]
    special_count = len(special.index)
  
    df_roles = df_roles.append({
        'Year': column, 
        'Direction': direction_count,
        'Music': music_count, 
        'Producer': producer_count, 
        'Construction': construction_count, 
        'Costume': costume_count, 
        'Hair & Make-Up': hair_count, 
        'Sound': sound_count, 
        'Post-Production': postp_count, 
        'Casting': casting_count, 
        'Support': support_count, 
        'Art': art_count, 
        'Camera': camera_count, 
        'Production': production_count, 
        'Script': script_count, 
        'Special FX': special_count 
    }, ignore_index=True)
#view output
df_roles
Out[126]:
Year Direction Music Producer Construction Costume Hair & Make-Up Sound Post-Production Casting Support Art Camera Production Script Special FX
0 2020 37.0 9.0 24.0 3.0 23.0 30.0 39.0 51.0 12.0 12.0 111.0 181.0 150.0 4.0 2.0
1 2019 38.0 7.0 25.0 2.0 23.0 34.0 34.0 48.0 10.0 12.0 106.0 173.0 124.0 4.0 4.0
2 2018 36.0 7.0 32.0 3.0 17.0 35.0 36.0 51.0 10.0 10.0 104.0 161.0 124.0 6.0 4.0
3 2017 33.0 8.0 33.0 3.0 21.0 29.0 42.0 57.0 9.0 10.0 88.0 174.0 108.0 8.0 4.0
4 2016 38.0 6.0 33.0 2.0 24.0 27.0 43.0 56.0 13.0 14.0 96.0 168.0 105.0 6.0 3.0
5 2015 43.0 9.0 37.0 2.0 26.0 35.0 43.0 65.0 11.0 11.0 96.0 169.0 119.0 7.0 2.0
6 2014 40.0 12.0 43.0 3.0 27.0 26.0 42.0 57.0 10.0 12.0 89.0 151.0 107.0 4.0 4.0
7 2013 41.0 10.0 49.0 3.0 24.0 40.0 45.0 59.0 9.0 11.0 95.0 164.0 113.0 1.0 4.0
8 2012 43.0 9.0 52.0 3.0 25.0 44.0 34.0 58.0 12.0 16.0 96.0 160.0 102.0 5.0 5.0
9 2011 50.0 9.0 51.0 4.0 29.0 41.0 38.0 59.0 10.0 15.0 99.0 157.0 116.0 9.0 5.0
10 2010 45.0 8.0 60.0 3.0 25.0 37.0 37.0 50.0 10.0 13.0 95.0 143.0 98.0 8.0 7.0
11 2009 48.0 6.0 66.0 6.0 25.0 52.0 41.0 54.0 13.0 16.0 117.0 158.0 112.0 9.0 3.0
12 2008 46.0 6.0 66.0 5.0 23.0 46.0 36.0 51.0 11.0 17.0 113.0 161.0 114.0 10.0 3.0
13 2007 62.0 10.0 66.0 6.0 30.0 51.0 42.0 49.0 15.0 14.0 113.0 163.0 121.0 14.0 3.0
14 2006 68.0 7.0 62.0 8.0 33.0 49.0 40.0 51.0 15.0 14.0 124.0 168.0 129.0 14.0 4.0
15 2005 65.0 9.0 74.0 8.0 42.0 49.0 44.0 53.0 17.0 12.0 125.0 162.0 115.0 16.0 2.0
16 2004 70.0 12.0 72.0 8.0 38.0 50.0 46.0 53.0 14.0 17.0 124.0 168.0 117.0 13.0 4.0
17 2003 71.0 8.0 76.0 8.0 37.0 48.0 41.0 48.0 12.0 13.0 120.0 170.0 111.0 15.0 4.0
18 2002 66.0 11.0 78.0 7.0 36.0 46.0 38.0 41.0 12.0 13.0 105.0 152.0 105.0 14.0 4.0
19 2001 56.0 12.0 81.0 9.0 39.0 42.0 37.0 44.0 8.0 15.0 103.0 141.0 105.0 15.0 3.0
20 2000 54.0 12.0 78.0 6.0 32.0 41.0 39.0 42.0 8.0 17.0 101.0 144.0 115.0 19.0 4.0
21 1999 56.0 13.0 66.0 12.0 36.0 35.0 36.0 42.0 9.0 12.0 91.0 138.0 108.0 18.0 3.0
22 1998 58.0 14.0 65.0 14.0 34.0 31.0 37.0 35.0 5.0 13.0 85.0 139.0 96.0 23.0 2.0
23 1997 52.0 11.0 58.0 8.0 24.0 28.0 36.0 30.0 4.0 10.0 76.0 137.0 90.0 21.0 0.0
24 1996 50.0 13.0 54.0 7.0 23.0 27.0 38.0 29.0 4.0 8.0 62.0 123.0 78.0 18.0 1.0
25 1995 48.0 11.0 59.0 7.0 17.0 21.0 33.0 32.0 4.0 12.0 49.0 110.0 72.0 15.0 2.0
26 1994 45.0 11.0 58.0 7.0 15.0 17.0 32.0 38.0 3.0 8.0 49.0 100.0 75.0 16.0 2.0
27 1993 42.0 9.0 53.0 7.0 14.0 11.0 28.0 33.0 2.0 6.0 34.0 89.0 55.0 13.0 1.0
28 1992 36.0 4.0 50.0 5.0 11.0 12.0 25.0 32.0 2.0 4.0 33.0 86.0 46.0 14.0 1.0
29 1991 29.0 6.0 45.0 4.0 15.0 15.0 30.0 32.0 0.0 4.0 26.0 88.0 47.0 13.0 1.0
30 1990 31.0 4.0 38.0 6.0 11.0 13.0 22.0 22.0 0.0 4.0 26.0 78.0 47.0 10.0 1.0
31 1989 23.0 5.0 29.0 6.0 12.0 11.0 19.0 18.0 0.0 4.0 28.0 70.0 31.0 8.0 2.0
32 1988 20.0 0.0 27.0 5.0 9.0 8.0 14.0 14.0 0.0 4.0 20.0 53.0 22.0 10.0 0.0
33 1987 20.0 0.0 28.0 3.0 9.0 4.0 16.0 16.0 0.0 4.0 20.0 58.0 22.0 10.0 0.0
34 1986 19.0 0.0 23.0 4.0 10.0 6.0 15.0 16.0 0.0 3.0 16.0 52.0 23.0 9.0 0.0
35 1984 17.0 0.0 29.0 6.0 8.0 5.0 12.0 16.0 0.0 2.0 21.0 53.0 21.0 8.0 0.0
36 1982 11.0 0.0 14.0 1.0 3.0 5.0 10.0 8.0 1.0 4.0 9.0 33.0 11.0 5.0 0.0
37 1981 10.0 0.0 16.0 1.0 3.0 7.0 9.0 4.0 0.0 3.0 9.0 28.0 8.0 5.0 0.0
38 1979 9.0 0.0 14.0 0.0 3.0 4.0 8.0 5.0 0.0 2.0 3.0 23.0 7.0 7.0 0.0
39 1978 9.0 0.0 12.0 0.0 2.0 6.0 9.0 4.0 0.0 0.0 4.0 21.0 8.0 8.0 0.0
40 1976 11.0 0.0 10.0 0.0 0.0 3.0 7.0 2.0 0.0 0.0 2.0 17.0 6.0 8.0 0.0
In [127]:
## Plot Setup
In [133]:
 
#colors = ['#064789', '#427AA1', '#E49273', '#679436', '#A5BE00', '#F7567C', '#243E36', '#A52422', '#E28413', '#2E0014', '#4E0250', '#F24333', '#345E56']
colors = ['red', 'green', 'magenta', 'orange','blue', 'pink','purple', 'yellow','cyan', 'lime', 'brown', 'grey', 'black']
In [134]:
years = df_roles['Year']
producers = df_roles['Producer']
construction = df_roles['Construction']
costume = df_roles['Costume']
hairs = df_roles['Hair & Make-Up']
sound = df_roles['Sound']
postp = df_roles['Post-Production']
casting = df_roles['Casting']
support = df_roles['Support']
art = df_roles['Art']
camera = df_roles['Camera']
production = df_roles['Production']
script = df_roles['Script']
special = df_roles['Special FX']

x_indexes = list(range(len(years)))
x_indexes.reverse()
In [135]:
plt.plot(x_indexes, producers, color=colors[0], label='Producers')
plt.plot(x_indexes, construction, color=colors[12], label='Construction')
plt.plot(x_indexes, costume, color=colors[1], label='Costume')
plt.plot(x_indexes, hairs, color=colors[2], label='Hair & Make-Up')
plt.plot(x_indexes, sound, color=colors[3], label='Sound')
plt.plot(x_indexes, postp, color=colors[4], label='Post-Production')
plt.plot(x_indexes, casting, color=colors[5], label='Casting')
plt.plot(x_indexes, support, color=colors[6], label='Support')
plt.plot(x_indexes, art, color=colors[7], label='Art')
plt.plot(x_indexes, camera, color=colors[8], label='Camera')
plt.plot(x_indexes, production, color=colors[9], label='Production')
plt.plot(x_indexes, script, color=colors[10], label='Script')
plt.plot(x_indexes, special, color=colors[11], label='Special FX')

plt.xlabel('Years')
plt.ylabel('No. Workers')
plt.title('Number in Department by Year')
plt.xticks(ticks=x_indexes, labels=years, rotation=90)

plt.legend()
plt.grid(True)
plt.tight_layout()

plt.savefig('Outputs/Department_Growth/number_in_dept_by_year.png')
plt.show()
In [139]:
plt.plot(x_indexes, camera, color=colors[8], label='Camera')
plt.plot(x_indexes, production, color=colors[9], label='Production')
plt.plot(x_indexes, art, color=colors[7], label='Art')
plt.plot(x_indexes, producers, color=colors[0], label='Producers')
plt.plot(x_indexes, postp, color=colors[4], label='Post-Production')
plt.plot(x_indexes, hairs, color=colors[2], label='Hair & Make-Up')
plt.plot(x_indexes, sound, color=colors[3], label='Sound')


plt.xlabel('Years')
plt.ylabel('No. Workers')
plt.title('Departments with Higher Growth')
plt.xticks(ticks=x_indexes, labels=years, rotation=90)

plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Department_Growth/departments_higher_growth.png')
plt.show()
In [140]:
plt.plot(x_indexes, construction, color=colors[12], label='Construction')
plt.plot(x_indexes, costume, color=colors[1], label='Costume')
plt.plot(x_indexes, casting, color=colors[5], label='Casting')
plt.plot(x_indexes, support, color=colors[6], label='Support')
plt.plot(x_indexes, script, color=colors[10], label='Script')
plt.plot(x_indexes, special, color=colors[11], label='Special FX')

plt.xlabel('Years')
plt.ylabel('No. Workers')
plt.title('Departments with Lower Growth')
plt.xticks(ticks=x_indexes, labels=years, rotation=90)

plt.legend()
plt.grid(True)
plt.tight_layout()

#plt.savefig('Outputs/Department_Growth/lower_growth_depts.png')
plt.show()

Location

Urban / Rural

The dataset is compromised in terms of location data. We take the last known postcode of entries that include a postcode and check the postcode against a list of rural postcodes.

In [15]:
# Where no postcode is given
df['Postcode1'].isna().value_counts()
Out[15]:
False    3104
True      530
Name: Postcode1, dtype: int64
In [16]:
# Number of Entries with Rural Postcode
df['Rural'].value_counts()
Out[16]:
True    155
Name: Rural, dtype: int64
In [17]:
# Entries with Urban Postcode / Entries with Rural Postcode
filt = df['Postcode1'].notna()
postcodes = df[filt]
postcodes['Rural'].value_counts(dropna=False)
Out[17]:
NaN     2949
True     155
Name: Rural, dtype: int64
In [18]:
# Percentage of Urban to Rural Postcodes 

filt = df['Postcode1'].notna()
postcodes = df[filt]
postcodes['Rural'].value_counts(normalize=True, dropna=False).apply(lambda x: format((x*100), '.0f'))
Out[18]:
NaN     95
True     5
Name: Rural, dtype: object
In [19]:
# Percentage of all postcode values inc nan to Rural Postcodes 
df['Rural'].value_counts(normalize=True, dropna=False).apply(lambda x: format((x*100), '.0f'))
Out[19]:
NaN     96
True     4
Name: Rural, dtype: object

Data for location is inconsistent and not tracked accurately over time. We've reduced this to a look at what is effectively a 'last known postcode area' for each entry.

In [20]:
df['Postcode1'].value_counts().head(20)
Out[20]:
G12     203
G20     144
G41     131
G11     124
G42     108
EH3      91
EH6      91
G        89
G51      63
EH7      57
EH       52
G31      51
G3 8     47
G61      46
EH10     45
EH1      43
EH4      37
EH9      36
G3 7     36
G3 6     36
Name: Postcode1, dtype: int64

Trainee Program

In [21]:
# Trainee data float to string
df['Trainee prog'] = df['Trainee prog'].fillna(-1)
df['Trainee prog'] = df['Trainee prog'].astype(int)
df['Trainee prog'] = df['Trainee prog'].astype(str)
df['Trainee prog'] = df['Trainee prog'].replace('-1', np.nan) 
In [22]:
df['Trainee prog'].count()
Out[22]:
116
In [23]:
# Show number of trainees on Trainee Program in a given year
df['Trainee prog'].value_counts()
Out[23]:
2007    12
2009     9
2004     8
2002     7
2006     7
1997     6
2000     6
1994     6
2019     5
1996     5
1991     4
1990     4
1993     4
1986     3
1989     3
2011     3
1978     3
1983     3
1982     3
2013     3
1987     3
1984     3
2015     2
1979     2
1981     1
1988     1
Name: Trainee prog, dtype: int64

Get Median Value for Numbers of Trainees

In [24]:
print('Average number of trainees over years the programs ran: ' + str(df['Trainee prog'].value_counts().median())
      + ' trainees')
Average number of trainees over years the programs ran: 3.5 trainees

Trainees self declaring as Trainees and NOT on a Trainee Program

In [25]:
# Filter Department for 'Trainee', return number of years in directory for each
filt = df['Role 1 Category'] == 'Trainee'
trainees = df[filt]
trainees['No of Yrs']
Out[25]:
75      1.0
461     1.0
468     1.0
627     1.0
1016    3.0
2537    1.0
Name: No of Yrs, dtype: float64

Length of career of trainees ordered by the year they became trainees

In [26]:
years = df.groupby(['Trainee prog'])
years['No of Yrs'].value_counts()
Out[26]:
Trainee prog  No of Yrs
1978          6.0          1
              36.0         1
              37.0         1
1979          7.0          1
              32.0         1
1981          29.0         1
1982          3.0          1
              26.0         1
              28.0         1
1983          3.0          1
              21.0         1
              36.0         1
1984          2.0          1
              14.0         1
              34.0         1
1986          17.0         1
              19.0         1
              34.0         1
1987          8.0          1
              12.0         1
              32.0         1
1988          7.0          1
1989          6.0          1
              10.0         1
              13.0         1
1990          2.0          1
              4.0          1
              9.0          1
              30.0         1
1991          1.0          1
              4.0          1
              5.0          1
              29.0         1
1993          11.0         2
              5.0          1
              7.0          1
1994          3.0          1
              4.0          1
              6.0          1
              7.0          1
              8.0          1
              22.0         1
1996          2.0          2
              10.0         1
              21.0         1
              23.0         1
1997          1.0          1
              3.0          1
              4.0          1
              7.0          1
              8.0          1
              16.0         1
2000          7.0          2
              1.0          1
              5.0          1
              8.0          1
              19.0         1
2002          2.0          3
              1.0          1
              3.0          1
              5.0          1
              11.0         1
2004          4.0          2
              7.0          2
              1.0          1
              3.0          1
              5.0          1
              6.0          1
2006          1.0          4
              3.0          2
              13.0         1
2007          1.0          3
              5.0          3
              6.0          2
              2.0          1
              3.0          1
              12.0         1
              13.0         1
2009          1.0          4
              2.0          2
              3.0          1
              4.0          1
              10.0         1
2011          1.0          1
              3.0          1
              4.0          1
2013          2.0          1
              8.0          1
              9.0          1
2015          3.0          2
2019          1.0          5
Name: No of Yrs, dtype: int64

New DataFrame from selected columns

In [27]:
trainees = df[['Trainee prog', 'No of Yrs', 'Gender', 'Role 1 Category']].dropna()
trainees = trainees.sort_values('Trainee prog')
trainees
Out[27]:
Trainee prog No of Yrs Gender Role 1 Category
1417 1978 36.0 Male Camera
1558 1978 6.0 Male Post-Production
2950 1978 37.0 Female Post-Production
337 1979 7.0 Male Direction
454 1979 32.0 Male Camera
381 1981 29.0 Male Production
3619 1982 26.0 Male Producer
2182 1982 3.0 Male Production
476 1982 28.0 Male Camera
207 1983 21.0 Female Camera
1939 1983 3.0 Male Post-Production
2635 1983 36.0 Male Camera
1493 1984 34.0 Male Sound
278 1984 14.0 Female Producer
2083 1984 2.0 Female Camera
480 1986 34.0 Female Post-Production
3570 1986 19.0 Male Sound
3532 1986 17.0 Female Camera
463 1987 8.0 Female Production
3067 1987 12.0 Female Support
3614 1987 32.0 Male Sound
2688 1988 7.0 Female Post-Production
1285 1989 6.0 Male Camera
585 1989 10.0 Male Sound
868 1989 13.0 Female Direction
1183 1990 4.0 Male Post-Production
1408 1990 2.0 Female Camera
2816 1990 9.0 Male Camera
3069 1990 30.0 Male Art
2062 1991 29.0 Male Camera
640 1991 4.0 Female Direction
586 1991 1.0 Female Production
2262 1991 5.0 Male Post-Production
2283 1993 11.0 Female Camera
1568 1993 11.0 Female Hair & Make-Up
2464 1993 7.0 Male Direction
1763 1993 5.0 Female Direction
268 1994 7.0 Female Production
2296 1994 4.0 Male Art
3415 1994 6.0 Male Camera
3573 1994 3.0 Female Direction
3581 1994 8.0 Male Art
363 1994 22.0 Male Post-Production
2046 1996 21.0 Male Post-Production
3583 1996 10.0 Female Production
262 1996 23.0 Female Camera
3077 1996 2.0 Female Costume
427 1996 2.0 Male Direction
2134 1997 16.0 Female Hair & Make-Up
2201 1997 3.0 Female Costume
932 1997 1.0 Female Post-Production
759 1997 4.0 Male Camera
1113 1997 8.0 Female Production
2429 1997 7.0 Male Art
1469 2000 5.0 Unknown Production
1632 2000 19.0 Male Camera
2333 2000 8.0 Female Post-Production
2798 2000 1.0 Male Production
633 2000 7.0 Female Art
1550 2000 7.0 Male Camera
3393 2002 2.0 Female Art
2519 2002 5.0 Female Producer
3428 2002 1.0 Female Direction
86 2002 11.0 Male Camera
1757 2002 2.0 Male Production
3609 2002 2.0 Female Production
1024 2002 3.0 Male Post-Production
3291 2004 5.0 Female Costume
937 2004 6.0 Male Camera
1735 2004 3.0 Male Production
3095 2004 7.0 Female Hair & Make-Up
2539 2004 4.0 Male Production
1798 2004 1.0 Male Production
2091 2004 7.0 Female Production
177 2004 4.0 Male Art
1190 2006 3.0 Male Camera
2753 2006 1.0 Female Production
1060 2006 1.0 Male Camera
2419 2006 1.0 Female Costume
1538 2006 13.0 Male Production
1328 2006 3.0 Male Production
2430 2006 1.0 Female Production
2795 2007 13.0 Female Production
1187 2007 6.0 Female Costume
3437 2007 3.0 Male Camera
2482 2007 1.0 Male Production
2002 2007 2.0 Female Production
1693 2007 5.0 Male Production
545 2007 5.0 Female Camera
584 2007 1.0 Female Production
829 2007 6.0 Male Post-Production
1775 2007 12.0 Male Camera
1002 2007 5.0 Female Art
2945 2007 1.0 Female Production
3537 2009 1.0 Female Art
2343 2009 3.0 Male Hair & Make-Up
588 2009 1.0 Female Production
382 2009 2.0 Female Production
2684 2009 1.0 Male Production
2628 2009 2.0 Male Camera
1029 2009 4.0 Female Production
2444 2009 10.0 Male Production
767 2009 1.0 Male Production
1138 2011 1.0 Female Art
442 2011 3.0 Male Production
198 2011 4.0 Unknown Production
1075 2013 9.0 Female Production
2749 2013 8.0 Female Direction
218 2013 2.0 Male Post-Production
935 2015 3.0 Female Production
3387 2015 3.0 Unknown Camera
2302 2019 1.0 Female Production
1833 2019 1.0 Male Production
832 2019 1.0 Female Costume
1948 2019 1.0 Male Camera
3010 2019 1.0 Male Camera

Trainee Gender

In [28]:
trainees['Gender'].value_counts()
Out[28]:
Male       59
Female     54
Unknown     3
Name: Gender, dtype: int64
In [29]:
trainees['Gender'].value_counts(normalize=True).apply(lambda x: format((x*100), '.0f'))
Out[29]:
Male       51
Female     47
Unknown     3
Name: Gender, dtype: object

Trainee Gender & Department

In [30]:
dept_grp = trainees.groupby(['Role 1 Category'])
dept_grp['Gender'].value_counts()
Out[30]:
Role 1 Category  Gender 
Art              Female      5
                 Male        5
Camera           Male       20
                 Female      7
                 Unknown     1
Costume          Female      6
Direction        Female      6
                 Male        3
Hair & Make-Up   Female      3
                 Male        1
Post-Production  Male        9
                 Female      5
Producer         Female      2
                 Male        1
Production       Female     19
                 Male       16
                 Unknown     2
Sound            Male        4
Support          Female      1
Name: Gender, dtype: int64
In [31]:
dept_grp = trainees.groupby(['Role 1 Category'])
dept_grp['Gender'].value_counts(normalize=True)
Out[31]:
Role 1 Category  Gender 
Art              Female     0.500000
                 Male       0.500000
Camera           Male       0.714286
                 Female     0.250000
                 Unknown    0.035714
Costume          Female     1.000000
Direction        Female     0.666667
                 Male       0.333333
Hair & Make-Up   Female     0.750000
                 Male       0.250000
Post-Production  Male       0.642857
                 Female     0.357143
Producer         Female     0.666667
                 Male       0.333333
Production       Female     0.513514
                 Male       0.432432
                 Unknown    0.054054
Sound            Male       1.000000
Support          Female     1.000000
Name: Gender, dtype: float64

Number of trainees in each department by year

In [32]:
year_grp = trainees.groupby(['Trainee prog'])
year_grp['Role 1 Category'].value_counts()
Out[32]:
Trainee prog  Role 1 Category
1978          Post-Production    2
              Camera             1
1979          Camera             1
              Direction          1
1981          Production         1
1982          Camera             1
              Producer           1
              Production         1
1983          Camera             2
              Post-Production    1
1984          Camera             1
              Producer           1
              Sound              1
1986          Camera             1
              Post-Production    1
              Sound              1
1987          Production         1
              Sound              1
              Support            1
1988          Post-Production    1
1989          Camera             1
              Direction          1
              Sound              1
1990          Camera             2
              Art                1
              Post-Production    1
1991          Camera             1
              Direction          1
              Post-Production    1
              Production         1
1993          Direction          2
              Camera             1
              Hair & Make-Up     1
1994          Art                2
              Camera             1
              Direction          1
              Post-Production    1
              Production         1
1996          Camera             1
              Costume            1
              Direction          1
              Post-Production    1
              Production         1
1997          Art                1
              Camera             1
              Costume            1
              Hair & Make-Up     1
              Post-Production    1
              Production         1
2000          Camera             2
              Production         2
              Art                1
              Post-Production    1
2002          Production         2
              Art                1
              Camera             1
              Direction          1
              Post-Production    1
              Producer           1
2004          Production         4
              Art                1
              Camera             1
              Costume            1
              Hair & Make-Up     1
2006          Production         4
              Camera             2
              Costume            1
2007          Production         6
              Camera             3
              Art                1
              Costume            1
              Post-Production    1
2009          Production         6
              Art                1
              Camera             1
              Hair & Make-Up     1
2011          Production         2
              Art                1
2013          Direction          1
              Post-Production    1
              Production         1
2015          Camera             1
              Production         1
2019          Camera             2
              Production         2
              Costume            1
Name: Role 1 Category, dtype: int64

Trainee Plots

In [33]:
# Data
trainee_years = trainees['Trainee prog'].unique()
trainee_numbers = trainees.groupby(['Trainee prog'])
trainee_numbers = trainee_numbers['Trainee prog'].value_counts().array

# Plot
width=0.7
plt.bar(trainee_years, trainee_numbers, color='magenta', width = width, label='Trainees')

plt.xlabel('Year of Traineeship')
plt.ylabel('No. of Trainees')
plt.title('Trainees')
plt.xticks(ticks=trainee_years, rotation=90)

plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Trainees/trainees_per_year.png')
plt.show()

Career Lengths & Longevity

Basic figures for Career Lengths

In [34]:
# Number of Years in Directory, number of entries present for that duration
df['No of Yrs'].value_counts()
Out[34]:
1.0     1120
2.0      530
3.0      355
4.0      247
5.0      217
6.0      128
7.0      121
8.0      101
9.0       97
10.0      85
11.0      68
13.0      54
12.0      54
15.0      51
14.0      47
17.0      35
18.0      31
16.0      31
22.0      30
19.0      27
21.0      27
20.0      26
24.0      21
25.0      19
26.0      17
29.0      14
23.0      14
28.0      11
30.0      10
27.0       9
34.0       7
35.0       7
32.0       6
31.0       4
33.0       4
36.0       3
37.0       2
38.0       1
41.0       1
Name: No of Yrs, dtype: int64

Career length grouped into 5 year bins

In [35]:
bins = pd.cut(df['No of Yrs'], [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])
df.groupby(bins)['No of Yrs'].agg(['count'])
Out[35]:
count
No of Yrs
(0, 5] 2469
(5, 10] 532
(10, 15] 274
(15, 20] 150
(20, 25] 111
(25, 30] 61
(30, 35] 28
(35, 40] 6
(40, 45] 1
In [36]:
# Data
indexes = ['1 year','2-3 years', '4-6 years', '7-20 years', '20-41 years']
bins = pd.cut(df['No of Yrs'], [0, 1, 3, 6, 20, 41])
values = df.groupby(bins)['No of Yrs'].agg(['count'])
print(values)
           count
No of Yrs       
(0, 1]      1120
(1, 3]       885
(3, 6]       592
(6, 20]      828
(20, 41]     207
In [37]:
# Simplify Data
simple_values = (values.values)
value_list = [ item for elem in simple_values for item in elem]
# Plot
colors = sns.cubehelix_palette(start=2, rot=1, dark=0.5)
#colors = ['blue', 'magenta', 'orange', 'green', 'lime', 'cyan', 'yellow']
plt.pie(value_list,  labels=indexes, labeldistance=1.2, shadow=True, 
        startangle=90, autopct='%1.0f%%', colors=colors, wedgeprops={'edgecolor':'black'})
plt.title('% of entries with career of given length')
plt.tight_layout()
plt.savefig('Outputs/Longevity_Pie/longevity_more_bins_percentages.png')
plt.show()
In [ ]:
 
In [ ]:
 

Personnel in Directory for more than 1 year

In [38]:
filtered = df['No of Yrs'].apply(lambda x: x > 1)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(lambda x: format((x*100), '.0f')))
True     2512
False    1122
Name: No of Yrs, dtype: int64

Percentages
True     69
False    31
Name: No of Yrs, dtype: object

Personnel in Directory for more than 6 years

In [39]:
filtered = df['No of Yrs'].apply(lambda x: x > 6)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(lambda x: format((x*100), '.0f')))
False    2599
True     1035
Name: No of Yrs, dtype: int64

Percentages
False    72
True     28
Name: No of Yrs, dtype: object

Personnel in Directory for more than 15 years

In [40]:
filtered = df['No of Yrs'].apply(lambda x: x > 15)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(lambda x: format((x*100), '.0f')))
False    3277
True      357
Name: No of Yrs, dtype: int64

Percentages
False    90
True     10
Name: No of Yrs, dtype: object

Personnel in Directory for more than 20 years

In [41]:
filtered = df['No of Yrs'].apply(lambda x: x > 20)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(lambda x: format((x*100), '.0f')))
False    3427
True      207
Name: No of Yrs, dtype: int64

Percentages
False    94
True      6
Name: No of Yrs, dtype: object

Median Career Length All Entries

In [42]:
df['No of Yrs'].median()
Out[42]:
3.0

Median Career Length for Entries in for more than 1 Year

In [43]:
filt = (df['No of Yrs'] > 1)
df2 = df[filt]
df2['No of Yrs'].median()
Out[43]:
5.0
In [44]:
# Percentages
print('No Yrs  Percentage')
print(df['No of Yrs'].value_counts(normalize=True).apply(lambda x: format((x*100), '.0f')))
No Yrs  Percentage
1.0     31
2.0     15
3.0     10
4.0      7
5.0      6
6.0      4
7.0      3
8.0      3
9.0      3
10.0     2
11.0     2
13.0     1
12.0     1
15.0     1
14.0     1
17.0     1
18.0     1
16.0     1
22.0     1
19.0     1
21.0     1
20.0     1
24.0     1
25.0     1
26.0     0
29.0     0
23.0     0
28.0     0
30.0     0
27.0     0
34.0     0
35.0     0
32.0     0
31.0     0
33.0     0
36.0     0
37.0     0
38.0     0
41.0     0
Name: No of Yrs, dtype: object

Career Lengths by Department

In [45]:
# Showing Long Careers & Breakdown by Department
lengths = df.groupby(['No of Yrs'])
lengths['Role 1 Category'].value_counts().tail(107)
Out[45]:
No of Yrs  Role 1 Category
20.0       Art                6
           Producer           5
           Sound              4
           Direction          3
           Production         3
           Camera             2
           Costume            1
           Hair & Make-Up     1
           Post-Production    1
21.0       Camera             8
           Post-Production    4
           Art                3
           Producer           3
           Production         3
           Sound              3
           Costume            1
           Music              1
           Script             1
22.0       Art                7
           Camera             7
           Costume            3
           Production         3
           Direction          2
           Hair & Make-Up     2
           Post-Production    2
           Sound              2
           Casting            1
           Producer           1
23.0       Camera             5
           Art                2
           Production         2
           Direction          1
           Music              1
           Producer           1
           Script             1
           Sound              1
24.0       Camera             6
           Production         5
           Art                3
           Post-Production    2
           Costume            1
           Hair & Make-Up     1
           Producer           1
           Script             1
           Support            1
25.0       Art                5
           Sound              3
           Camera             2
           Hair & Make-Up     2
           Production         2
           Costume            1
           Direction          1
           Post-Production    1
           Producer           1
           Support            1
26.0       Camera             6
           Art                3
           Hair & Make-Up     3
           Direction          2
           Construction       1
           Producer           1
           Production         1
27.0       Camera             4
           Producer           2
           Direction          1
           Production         1
           Sound              1
28.0       Producer           3
           Camera             2
           Sound              2
           Art                1
           Post-Production    1
           Production         1
           Script             1
29.0       Camera             5
           Direction          3
           Production         2
           Hair & Make-Up     1
           Post-Production    1
           Producer           1
           Sound              1
30.0       Camera             5
           Costume            2
           Post-Production    2
           Art                1
31.0       Camera             2
           Post-Production    1
           Sound              1
32.0       Camera             3
           Sound              2
           Hair & Make-Up     1
33.0       Camera             2
           Sound              2
34.0       Camera             2
           Post-Production    2
           Art                1
           Direction          1
           Sound              1
35.0       Camera             3
           Art                2
           Costume            1
           Support            1
36.0       Camera             3
37.0       Art                1
           Post-Production    1
38.0       Sound              1
41.0       Camera             1
Name: Role 1 Category, dtype: int64
In [46]:
# Showing Short Careers Breakdown by Department Numbers
lengths = df.groupby(['No of Yrs'])
lengths['Role 1 Category'].value_counts().head(104)
Out[46]:
No of Yrs  Role 1 Category
1.0        Production         315
           Camera             181
           Art                153
           Post-Production     66
           Direction           65
           Hair & Make-Up      57
           Costume             51
           Producer            43
           Sound               42
           Music               35
           Support             34
           Construction        24
           Casting             21
           Script              17
           Special FX          11
           Trainee              5
2.0        Production         139
           Camera              90
           Art                 65
           Post-Production     36
           Producer            33
           Hair & Make-Up      32
           Direction           29
           Music               21
           Support             18
           Sound               17
           Casting             16
           Costume             14
           Script              13
           Construction         4
           Special FX           2
3.0        Production          92
           Camera              44
           Art                 36
           Producer            31
           Direction           30
           Hair & Make-Up      24
           Post-Production     22
           Costume             17
           Script              17
           Sound               11
           Support              9
           Casting              7
           Construction         7
           Music                5
           Special FX           2
           Trainee              1
4.0        Production          59
           Camera              47
           Art                 24
           Direction           22
           Producer            22
           Hair & Make-Up      17
           Post-Production     15
           Costume             14
           Music                7
           Script               6
           Sound                5
           Support              4
           Casting              3
           Construction         2
5.0        Production          50
           Camera              41
           Art                 35
           Producer            17
           Post-Production     15
           Direction           14
           Hair & Make-Up      12
           Costume             10
           Sound                9
           Music                5
           Casting              3
           Construction         2
           Support              2
           Script               1
           Special FX           1
6.0        Camera              25
           Production          24
           Art                 18
           Direction           10
           Post-Production     10
           Producer             9
           Sound                9
           Costume              8
           Hair & Make-Up       4
           Construction         3
           Script               3
           Music                2
           Support              2
           Casting              1
7.0        Production          22
           Camera              21
           Art                 18
           Direction           14
           Producer            13
           Hair & Make-Up       8
           Post-Production      8
           Costume              6
           Script               4
           Sound                3
           Casting              1
           Construction         1
           Music                1
           Special FX           1
Name: Role 1 Category, dtype: int64

Visualizing Career Lengths

The following two charts show Longevity in the directory with frequency of entries with the given 'career' lengths plotted along the x axis. We chart the pattern for all entries, and for entries who are present for more than 1 year. While entries lasting only 1 year are at first glance an anomaly, being by far the biggest single group in the dataset, but declaring the least information given their short presence, we see the same pattern (scaling down) with the removal of those entries from the chart.

In [148]:
# Data
keys_list = df['No of Yrs'].value_counts().index.tolist()
values = df['No of Yrs'].value_counts().values
median = df['No of Yrs'].median()

# Plot
width=0.8
color = sns.cubehelix_palette(start=3, dark=0.5)

plt.bar(keys_list, values, color=color, width = width, label='Crew')
plt.axvline(median, linewidth=2, color='blue', label='Median')

plt.xlabel('Years')
plt.ylabel('Number of Workers')
plt.title('Basic Longevity - Showing All Entries')
plt.xticks(ticks=keys_list)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Longevity_Bar_Charts/basic_crew_longevity.png')
plt.show()
In [145]:
# Data
filt = (df['No of Yrs'] > 1)
df2 = df[filt]

keys_list = df2['No of Yrs'].value_counts().index.tolist()
values = df2['No of Yrs'].value_counts().values
median = df2['No of Yrs'].median()

# Plot
width=0.8
color = sns.cubehelix_palette(start=1.5, dark=0.5)

plt.bar(keys_list, values, color=color, width = width, label='Crew')
plt.axvline(median, linewidth=2, color='blue', label='Median')

plt.xlabel('Years')
plt.ylabel('Number of Workers')
plt.title('Basic Longevity > 1 Year')
plt.xticks(ticks=keys_list)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Longevity_Bar_Charts/basic_crew_longevity>1yr.png')
plt.show()

Key window of career lengths: between 4 and 20 years

We are interested in the factors that allow people to develop longer careers. The following chart shows frequency of career lengths in the entries for between 4 and 20 years. This offers a wide sample of entries who are clearly getting something out of being in the directory

In [144]:
filt2 = (df['No of Yrs'] > 3 ) & (df['No of Yrs'] < 21)
df3 = df[filt2]

out = df3['No of Yrs'].value_counts()
keys_list = out.index.tolist()
values = df3['No of Yrs'].value_counts().values
median = df3['No of Yrs'].median()

color = sns.cubehelix_palette(start=2, dark=0.5)
width=0.8
plt.bar(keys_list, values, color=color, width = width, label='Crew')

plt.axvline(median, linewidth=2, color='blue', label='Median')

plt.xlabel('Years Presence in Film Bang')
plt.ylabel('No. of Crew')
plt.title('Longevity: Presence in FB 4-20 years')
plt.xticks(ticks=keys_list)

plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Longevity_Bar_Charts/basic_crew_longevity_4_20_yrs.png')
plt.show()

1 Year Entries - Their Departments

In [50]:
filt_1 = (df['No of Yrs'] < 2)
df_1 = df[filt_1]
msg = 'Number in department in for only 1 year, historical'
print(msg)
print(len(msg) *'-')
entries = []
for dept in depts: 
    filt_dept = (df_1['Role 1 Category'] == dept)
    df_depts = df_1[filt_dept] 
    count = df_depts['No of Yrs'].value_counts().values
    entries.append(count[0])
    print(dept, count)
print(entries)
Number in department in for only 1 year, historical
---------------------------------------------------
Production [315]
Camera [181]
Post-Production [66]
Costume [51]
Casting [21]
Construction [24]
Direction [65]
Producer [43]
Art [153]
Script [17]
Hair & Make-Up [57]
Special FX [11]
Music [35]
Support [34]
Sound [42]
Trainee [5]
[315, 181, 66, 51, 21, 24, 65, 43, 153, 17, 57, 11, 35, 34, 42, 5]
In [143]:
width=0.7
color = sns.cubehelix_palette(start=2, dark=0.5)
plt.bar(depts, entries, color=color, width = width, label='Crew')

plt.xlabel('Departments')
plt.ylabel('No. of Crew')
plt.title('In for 1 Year')
plt.xticks(ticks=depts, rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Longevity_Bar_Charts/crew_in_for_1_year.png')
plt.show()

Long Careers In Film Bang

In [142]:
# Data
filt5 = (df['No of Yrs'] > 20 )
df4 = df[filt5]
keys_list = df4['No of Yrs'].value_counts().index.tolist()
values = df4['No of Yrs'].value_counts().values
# Plot
color = sns.cubehelix_palette(start=1, dark=0.5)
width=0.7
plt.bar(keys_list, values, color=color, width = width, label='Crew')

plt.xlabel('Years Presence in Film Bang')
plt.ylabel('No. of Crew')
plt.title('Longevity: Presence in FB over 20 years')
plt.xticks(ticks=keys_list)

plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Longevity_Bar_Charts/basic_crew_longevity_>20_yrs.png')
plt.show()

Career Lengths in Departments - Bins

In [53]:
import seaborn as sns
import matplotlib.pylab as plt

totals = df['Role 1 Category'].value_counts().array
depts = df['Role 1 Category'].value_counts().index.array
indexes = ['1 year','2-3 years', '4-6 years', '7-11 years', '12-18 years', '19-29 years', '30-42 years']

for index, dept in enumerate(depts):
    # set chart colour based on index 
    color = sns.cubehelix_palette(start=index, dark=0.5)
    # filter for dept in depts
    filt_dept = (df['Role 1 Category'] == dept)
    df_depts = df[filt_dept] 
    # set bins for aggregation
    bins = pd.cut(df_depts['No of Yrs'], [0, 1, 3, 6, 11, 18, 29, 42])
    values = df_depts.groupby(bins)['No of Yrs'].agg(['count'])
    # unpack the data 
    simple_values = (values.values)
    value_list = [ item for elem in simple_values for item in elem]
    # calculate median career length for given department
    median = str(df_depts['No of Yrs'].median())
    
    # Plot
    width = 0.7
    total = str(totals[index])
    label = 'Dept total = '+total+', \n Median= '+median+' years'
    plt.bar(indexes, value_list, color=color, width = width, label=label)
    plt.xlabel('Years Presence in Film Bang')
    plt.ylabel('Number of Entries')
    plt.title('Career lengths in ' + dept + ' dept. as of 2020')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('Outputs/Crew_Longevity_Bar_Charts/crew_longevity_by_year_prime_bins_bar_'+dept+'.png')
    plt.show()
In [ ]:
# do bins of years: 
# breakdown by department for 30 - 41 years
# breakdown by department for 19-29 years etc
In [ ]:
 
In [ ]:
 

Heatmap of Career Longevity

Using the calculation for career length each year (a count produced for each year an entry is present in the directory) we chart career lengths over time

In [54]:
years = [*range(1,42)]
In [55]:
#initialise a temporary dataframe
df_master = pd.DataFrame({'Years': years})
# loop through year columns and filter out NAN values
for year in df.loc[:, '2020':'1976']:
    filtyear = df[year].notna()
    dfyear = df[filtyear]
    # get numbers
    values_year = dfyear['Yr Cnt '+year].value_counts().array
    keys_year = dfyear['Yr Cnt '+year].value_counts().index
    # put them in mini temp dataframe
    df_year = pd.DataFrame({f'{year} Keys':keys_year, f'{year} Values':values_year})
    # join the mini dataframes to the temporary dataframe
    df_master = df_master.join(df_year.set_index(f'{year} Keys'), on='Years')      
data = df_master.set_index('Years')
In [56]:
# Plot setup
import seaborn as sns
import matplotlib.pylab as plt

plot_years = df.loc[:, '2020':'1976']

colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)
plt.figure(figsize=(22,10))
ax = sns.heatmap(data, linewidth=0.3, cmap=colours, annot=True, fmt=".0f")
plt.yticks(rotation=0) 
ax.xaxis.set_ticks_position('top')
ax.set_xticklabels(plot_years,rotation=90)
plt.xlabel('Years')
plt.ylabel('Length of Career in Years')
plt.title(f'Careers in Directory Over Time')
plt.savefig(f"Outputs/Heatmaps/career_longevity_basic_1.png")
plt.show()

Department Longevity

In [57]:
depts = df['Role 1 Category'].value_counts().index.array
In [58]:
df_master = pd.DataFrame({'Years': years})
In [59]:
import seaborn as sns
import matplotlib.pylab as plt

plot_years = df.loc[:, '2020':'1976']

def plot(dept):
    colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)
    plt.figure(figsize=(22,10))
    ax = sns.heatmap(data, linewidth=0.3, cmap=colours, annot=True, fmt=".0f")
    plt.yticks(rotation=0) 
    ax.xaxis.set_ticks_position('top')
    ax.set_xticklabels(plot_years,rotation=90)
    plt.xlabel('Years')
    plt.ylabel('Length of Career in Years')
    plt.title(f'Careers in {dept} Department Over Time')
    plt.savefig(f"Outputs/Heatmaps/career_longevity_{dept}_2.png")
    plt.show()  
In [60]:
# Loop through departments
for i in depts:
    #initialise a new df_master dataframe for each pass through departments
    years = [*range(1,42)]
    df_master = pd.DataFrame({'Years': years})
    # filter for department of given iteration
    deptfilt = (df['Role 1 Category'] == i)
    df_dept = df[deptfilt]
    # loop through year columns and filter out NAN values
    for year in df.loc[:, '2020':'1976']:
        filtyear = df_dept[year].notna()
        dfyear = df_dept[filtyear]
        # get numbers
        values_year = dfyear['Yr Cnt '+year].value_counts().array
        keys_year = dfyear['Yr Cnt '+year].value_counts().index
        # put them in a new mini dataframe
        df_year = pd.DataFrame({f'{year} Keys':keys_year, f'{year} Values':values_year})
        # join the mini dataframes to a temporary master dataframe
        df_master = df_master.join(df_year.set_index(f'{year} Keys'), on='Years')      
    # assign the new master dataframe to data variable and plot it 
    # plot function will expect data to be the current dataframe
    df_master = df_master.set_index('Years')
    data = df_master
    plot(i)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Gender

Gender is inferred. The Film Bang directory does not ask people to declare gender, or other personal information. The gender presented here is inferred from the names of entries (done during data capture, prior to anonymization of the data). Where gender is recorded here as 'unknown' it refers to a name that might conventionally refer to both a man or woman.

In [61]:
df['Gender'].value_counts()
Out[61]:
Male       1982
Female     1542
Unknown     110
Name: Gender, dtype: int64
In [62]:
df['Gender'].value_counts(normalize=True).apply(lambda x: format((x*100), '.0f'))
Out[62]:
Male       55
Female     42
Unknown     3
Name: Gender, dtype: object
In [63]:
role_grp = df.groupby(['Role 1 Category'])
role_grp['Gender'].value_counts().tail(300)
Out[63]:
Role 1 Category  Gender 
Art              Female     236
                 Male       224
                 Unknown     20
Camera           Male       567
                 Female      85
                 Unknown     18
Casting          Female      35
                 Male        26
                 Unknown      3
Construction     Male        51
Costume          Female     130
                 Male        18
                 Unknown      6
Direction        Male       167
                 Female      89
                 Unknown      5
Hair & Make-Up   Female     183
                 Male        15
                 Unknown      3
Music            Male        73
                 Female      11
                 Unknown      2
Post-Production  Male       175
                 Female      57
                 Unknown     10
Producer         Male       134
                 Female     125
                 Unknown      4
Production       Female     491
                 Male       300
                 Unknown     25
Script           Male        50
                 Female      28
                 Unknown      1
Sound            Male       133
                 Female      16
                 Unknown      3
Special FX       Male        17
                 Female       5
Support          Female      50
                 Male        27
                 Unknown      9
Trainee          Male         5
                 Female       1
Name: Gender, dtype: int64
In [99]:
# gender = []
# for column in df.loc[:, '2020':'1976']:
#     yr_gp = df.groupby([column])
#     year_data = yr_gp['Gender'].value_counts()
#     gender.append(year_data)

# print(gender)

# df_g = pd.DataFrame(gender)
# print(df_g)
In [100]:
ct = {
    'Year':[],
    'Male':[],
    'Female':[],
    'Unknown':[],
}
gender_ct = pd.DataFrame(ct)
In [101]:
for column in df.loc[:, '2020':'1976']:
      
    filt1 = (df[column] == column) & (df['Gender'] == 'Female')
    new_df = df[filt1]
    filt2 = (df[column] == column) & (df['Gender'] == 'Male')
    new_df2 = df[filt2]
    filt3 = (df[column] == column) & (df['Gender'] == 'Unknown')
    new_df3 = df[filt3]
    
    women = len(new_df.index)
    men = len(new_df2.index)
    no_gender = len(new_df3.index)
    gender_ct = gender_ct.append({'Year': column, 'Male': men,'Female': women, 'Unknown': no_gender}, ignore_index=True)
In [125]:
data = gender_ct
years_x = data['Year']
no_g_y = data['Unknown']
male_y = data['Male']
female_y = data['Female']

bars = np.add(male_y, female_y).tolist()
plt.bar(years_x, no_g_y, bottom=bars, color='orange', width=width, label='Gender Unknown')
plt.bar(years_x, male_y, bottom=female_y, color='green', width=width, label='Male')    
plt.bar(years_x, female_y, color='blue', width=width, label='Female')
    
plt.xlabel('Years')
plt.ylabel('No. Entries')
plt.title('Entire Directory by Gender')
plt.xticks(ticks=years_x, rotation=90)
    
plt.gca().invert_xaxis()
    
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("Outputs/Crew_Gender/basic_gender_chart.png")
plt.show()
In [104]:
# Numerical data on gender by year
print(gender_ct)
    Year   Male  Female  Unknown
0   2020  396.0   267.0     25.0
1   2019  372.0   255.0     17.0
2   2018  362.0   259.0     15.0
3   2017  381.0   234.0     12.0
4   2016  383.0   238.0     13.0
5   2015  407.0   255.0     13.0
6   2014  369.0   245.0     13.0
7   2013  389.0   264.0     15.0
8   2012  388.0   256.0     20.0
9   2011  407.0   267.0     18.0
10  2010  374.0   251.0     14.0
11  2009  414.0   294.0     18.0
12  2008  401.0   287.0     20.0
13  2007  418.0   324.0     17.0
14  2006  433.0   332.0     21.0
15  2005  440.0   330.0     23.0
16  2004  448.0   338.0     20.0
17  2003  447.0   317.0     18.0
18  2002  405.0   309.0     14.0
19  2001  389.0   303.0     18.0
20  2000  398.0   297.0     18.0
21  1999  379.0   286.0     11.0
22  1998  371.0   272.0      8.0
23  1997  334.0   249.0      2.0
24  1996  326.0   208.0      1.0
25  1995  303.0   188.0      1.0
26  1994  291.0   182.0      3.0
27  1993  251.0   145.0      1.0
28  1992  226.0   135.0      0.0
29  1991  226.0   129.0      0.0
30  1990  198.0   115.0      0.0
31  1989  177.0    89.0      0.0
32  1988  130.0    76.0      0.0
33  1987  146.0    64.0      0.0
34  1986  135.0    62.0      0.0
35  1984  147.0    53.0      0.0
36  1982   86.0    32.0      1.0
37  1981   78.0    24.0      2.0
38  1979   67.0    16.0      2.0
39  1978   65.0    18.0      0.0
40  1976   55.0    11.0      0.0
In [65]:
# Refactor this to break into smaller parts: data manipulation and plots. 
# Create plot function with a dept parameter
# Call plot function within the loop, passing the dept / column variable as argument. 

width = 0.7
gender_by_year = {
    'Year':[],
    'Male':[],
    'Female':[],
    'Unknown':[],
}
df_gender = pd.DataFrame(gender_by_year)

depts = df['Role 1 Category'].dropna().unique()
depts
for i in depts:
    df_gender = df_gender[0:0]
    filt = (df['Role 1 Category'] == i)
    department = df[filt]   
    for column in department.loc[:, '2020':'1976']:
        filt1 = (department[column] == column) & (department['Gender'] == 'Female')
        new_df = department[filt1]
        filt2 = (department[column] == column) & (department['Gender'] == 'Male')
        new_df2 = department[filt2]
        filt3 = (department[column] == column) & (department['Gender'] == 'Unknown')
        new_df3 = department[filt3]     
        women = len(new_df.index)
        men = len(new_df2.index)
        no_gender = len(new_df3.index)  
        df_gender = df_gender.append({'Year': column, 'Male': men,'Female': women, 'Unknown': no_gender}, ignore_index=True)

    data = df_gender
    years_x = data['Year']
    no_g_y = data['Unknown']
    male_y = data['Male']
    female_y = data['Female']

    bars = np.add(male_y, female_y).tolist()
    plt.bar(years_x, no_g_y, bottom=bars, color='#427AA1', width=width, label='Unknown')
    plt.bar(years_x, male_y, bottom=female_y, color='#679436', width=width, label='Male')    
    plt.bar(years_x, female_y, color='#E28413', width=width, label='Female')
    
    plt.xlabel('Years')
    plt.ylabel('No. Workers')
    plt.title(f'{i} Department Workers by Gender')
    plt.xticks(ticks=years_x, rotation=90)
    
    plt.gca().invert_xaxis()
    
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"Outputs/Departments_Gender/{i}_department_workers_by_gender.png")
    plt.show()
In [ ]:
 

Career Length by Gender

In [223]:
g = df.groupby(['No of Yrs'])
g_count = g['Gender'].value_counts()
print(g_count)
No of Yrs  Gender 
1.0        Male       589
           Female     482
           Unknown     49
2.0        Male       283
           Female     227
           Unknown     20
3.0        Male       176
           Female     168
           Unknown     11
4.0        Male       130
           Female     109
           Unknown      8
5.0        Male       110
           Female     105
           Unknown      2
6.0        Male        75
           Female      50
           Unknown      3
7.0        Male        62
           Female      59
8.0        Male        64
           Female      34
           Unknown      3
9.0        Male        57
           Female      36
           Unknown      4
10.0       Male        51
           Female      34
11.0       Male        40
           Female      26
           Unknown      2
12.0       Male        32
           Female      21
           Unknown      1
13.0       Female      27
           Male        27
14.0       Female      24
           Male        22
           Unknown      1
15.0       Male        31
           Female      19
           Unknown      1
16.0       Male        21
           Female      10
17.0       Male        26
           Female       9
18.0       Male        20
           Female      11
19.0       Male        17
           Female      10
20.0       Male        16
           Female      10
21.0       Male        18
           Female       7
           Unknown      2
22.0       Male        17
           Female      12
           Unknown      1
23.0       Female       7
           Male         7
24.0       Female      11
           Male        10
25.0       Male        10
           Female       8
           Unknown      1
26.0       Male        10
           Female       7
27.0       Male         9
28.0       Male         9
           Female       2
29.0       Male         8
           Female       6
30.0       Male         8
           Female       2
31.0       Male         4
32.0       Male         5
           Female       1
33.0       Female       2
           Male         2
34.0       Male         5
           Female       2
35.0       Male         4
           Female       3
36.0       Male         3
37.0       Female       1
           Male         1
38.0       Male         1
41.0       Male         1
Name: Gender, dtype: int64

Data

In [210]:
# Create Dataframe with Yrs column
df_clg = pd.DataFrame({'Yrs': range(1, 43)})

# Filtering
m_filt = (df['Gender'] == 'Male')
df_m = df[m_filt]
f_filt = (df['Gender'] == 'Female')
df_f = df[f_filt]
u_filt = (df['Gender'] == 'Unknown')
df_u = df[u_filt]

# parse data
m_length = df_m.groupby(['No of Yrs'])
m_c_length_values = m_length['Gender'].value_counts().array
m_c_length_keys = m_length['Gender'].value_counts().index.tolist()
m_indexes = [i[0] for i in m_c_length_keys]

f_length = df_f.groupby(['No of Yrs'])
f_c_length_values = f_length['Gender'].value_counts().array
f_c_length_keys = f_length['Gender'].value_counts().index.tolist()
f_indexes = [i[0] for i in f_c_length_keys]

u_length = df_u.groupby(['No of Yrs'])
u_c_length_values = u_length['Gender'].value_counts().array
u_c_length_keys = u_length['Gender'].value_counts().index.tolist()
u_indexes = [i[0] for i in u_c_length_keys]

# Join Data

df_male = pd.DataFrame({'M Yrs':m_indexes, 'M Values':m_c_length_values})
df_clg = df_clg.join(df_male.set_index('M Yrs'), on='Yrs')  
df_female = pd.DataFrame({'F Yrs':f_indexes, 'F Values':f_c_length_values})
df_clg = df_clg.join(df_female.set_index('F Yrs'), on='Yrs') 
df_unknown = pd.DataFrame({'U Yrs':u_indexes, 'U Values':u_c_length_values})
df_clg = df_clg.join(df_unknown.set_index('U Yrs'), on='Yrs') 

print(df_clg)
    Yrs  M Values  F Values  U Values
0     1     589.0     482.0      49.0
1     2     283.0     227.0      20.0
2     3     176.0     168.0      11.0
3     4     130.0     109.0       8.0
4     5     110.0     105.0       2.0
5     6      75.0      50.0       3.0
6     7      62.0      59.0       NaN
7     8      64.0      34.0       3.0
8     9      57.0      36.0       4.0
9    10      51.0      34.0       NaN
10   11      40.0      26.0       2.0
11   12      32.0      21.0       1.0
12   13      27.0      27.0       NaN
13   14      22.0      24.0       1.0
14   15      31.0      19.0       1.0
15   16      21.0      10.0       NaN
16   17      26.0       9.0       NaN
17   18      20.0      11.0       NaN
18   19      17.0      10.0       NaN
19   20      16.0      10.0       NaN
20   21      18.0       7.0       2.0
21   22      17.0      12.0       1.0
22   23       7.0       7.0       NaN
23   24      10.0      11.0       NaN
24   25      10.0       8.0       1.0
25   26      10.0       7.0       NaN
26   27       9.0       NaN       NaN
27   28       9.0       2.0       NaN
28   29       8.0       6.0       NaN
29   30       8.0       2.0       NaN
30   31       4.0       NaN       NaN
31   32       5.0       1.0       NaN
32   33       2.0       2.0       NaN
33   34       5.0       2.0       NaN
34   35       4.0       3.0       NaN
35   36       3.0       NaN       NaN
36   37       1.0       1.0       NaN
37   38       1.0       NaN       NaN
38   39       NaN       NaN       NaN
39   40       NaN       NaN       NaN
40   41       1.0       NaN       NaN
41   42       NaN       NaN       NaN

Basic Plot

In [306]:
width = 0.3

yrs = df_clg['Yrs']
male = df_clg['M Values']
female = df_clg['F Values']
unknown = df_clg['U Values']

#bars = np.add(male_y, female_y).tolist()
plt.bar(yrs + width, unknown, color='orange', width=width, label='Gender Unknown', log=True)
plt.bar(yrs, male, color='green', width=width, label='Male',log=True)    
plt.bar(yrs - width, female, color='blue', width=width, label='Female',log=True)
    
plt.xlabel('Career Length in Years')
plt.ylabel('No. Entries - Log Scale')
plt.title('Career Length by Gender')
plt.xticks(ticks=yrs, rotation=90)
    
#plt.gca().invert_xaxis()
    
plt.legend(loc='upper right')
plt.grid(True)
plt.tight_layout()
plt.savefig("Outputs/Crew_Gender/gender_longevity_log.png")
plt.show()
In [296]:
bins = pd.cut(df_clg['Yrs'], [0, 1, 3, 6, 11, 18, 29, 42])
m_values = male.groupby(bins)
f_values = female.groupby(bins)
u_values = unknown.groupby(bins)
#print(m_values)
m_out = [i for i in m_values]
f_out = [i for i in f_values]
u_out = [i for i in u_values]
In [307]:
# Data
m_totals = []
f_totals = []
u_totals = []
for i in m_out:
    m_totals.append(i[1].sum())
for i in f_out:
    f_totals.append(i[1].sum())
for i in u_out:
    u_totals.append(i[1].sum())

# Refactor this to OOP plot (fig, axs) in order to overlay the 3 plots as subplots (allowing different bars in each bin)
# Plot
indexes = ['1 year','2-3 years', '4-6 years', '7-11 years', '12-18 years', '19-29 years', '30-42 years']
width = 0.3
#total = str(totals[index])
plt.bar(indexes, m_totals, color='orange', width=width, align='edge', label='Male')
plt.bar(indexes, f_totals, color='green', width=-width, align='edge', label='Female')
plt.bar(indexes, u_totals, color='blue', width=width,  label='Unknown Gender')
plt.xlabel('Year Bins')
plt.ylabel('Number of Entries')
plt.title('Career Length Bins')
plt.legend()
#plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Gender/longevity_gender_crew_bins.png')
plt.show()

Dropouts

See Turnover Dropouts New Entries Notebook

Other

In [ ]:
group by department, longevity
In [ ]:
 
In [ ]: